{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "import openpyxl\n",
    "from openpyxl import Workbook\n",
    "import pandas as pd\n",
    "from openpyxl.utils.dataframe import dataframe_to_rows\n",
    "import time\n",
    "import pyautogui\n",
    "from docx import Document\n",
    "from openpyxl.styles import Alignment\n",
    "import datetime\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Naver Scraping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 페이지 크롤링 중입니다.==================================================\n",
      "2 페이지 크롤링 중입니다.==================================================\n",
      "3 페이지 크롤링 중입니다.==================================================\n",
      "4 페이지 크롤링 중입니다.==================================================\n",
      "5 페이지 크롤링 중입니다.==================================================\n",
      "6 페이지 크롤링 중입니다.==================================================\n",
      "7 페이지 크롤링 중입니다.==================================================\n",
      "8 페이지 크롤링 중입니다.==================================================\n",
      "9 페이지 크롤링 중입니다.==================================================\n",
      "10 페이지 크롤링 중입니다.==================================================\n"
     ]
    }
   ],
   "source": [
    "# Search window applied to every results page (YYYY.MM.DD); edit these to change the period.\n",
    "SEARCH_FROM = '2012.01.01'\n",
    "SEARCH_TO = '2012.01.30'\n",
    "# Seconds to wait for a single HTTP response before raising instead of hanging forever.\n",
    "REQUEST_TIMEOUT = 10\n",
    "# Header sent with each article request.\n",
    "ARTICLE_HEADERS = {'User-agent': 'Mozila/5.0'}\n",
    "\n",
    "# Fixed part of the Naver news search URL. The keyword and the result offset are passed\n",
    "# through `params` instead, so requests URL-encodes them (spaces, '&', '#', ...).\n",
    "SEARCH_URL = (\n",
    "    'https://search.naver.com/search.naver?where=news&sm=tab_pge&sort=0&photo=0&field=0'\n",
    "    f'&pd=3&ds={SEARCH_FROM}&de={SEARCH_TO}&cluster_rank=22&mynews=0&office_type=0'\n",
    "    '&office_section_code=0&news_office_checked='\n",
    "    f\"&nso=so:r,p:from{SEARCH_FROM.replace('.', '')}to{SEARCH_TO.replace('.', '')},a:all\"\n",
    ")\n",
    "\n",
    "# CSS selectors for (title, body, date), keyed by the section the article URL redirects to.\n",
    "ARTICLE_SELECTORS = {\n",
    "    'entertain': ('.end_tit', '#articeBody', 'div.article_info > span > em'),\n",
    "    'sports': ('h4.title', '#newsEndContents', 'div.article_info > span > em'),\n",
    "    'news': (\n",
    "        '.media_end_head_headline',\n",
    "        '#dic_area',\n",
    "        'span.media_end_head_info_datestamp_time._ARTICLE_DATE_TIME',\n",
    "    ),\n",
    "}\n",
    "\n",
    "\n",
    "def tag_text(tag):\n",
    "    \"\"\"Return the stripped text of `tag`, or '' when the selector matched nothing (None).\"\"\"\n",
    "    return '' if tag is None else tag.text.strip()\n",
    "\n",
    "\n",
    "def parse_article(article_soup, final_url):\n",
    "    \"\"\"Extract the title, body and date text from one article page.\n",
    "\n",
    "    Args:\n",
    "        article_soup: BeautifulSoup tree of the article page itself.\n",
    "        final_url: URL after redirects; it decides which selector set is used\n",
    "            ('entertain', 'sports', or the general news layout otherwise).\n",
    "\n",
    "    Returns:\n",
    "        A (title, contents, date) tuple of strings. A field whose selector matched\n",
    "        nothing is '' so it can never inherit a value from a previous article.\n",
    "        The date is cut to its first 7 characters.\n",
    "    \"\"\"\n",
    "    if 'entertain' in final_url:\n",
    "        section = 'entertain'\n",
    "    elif 'sports' in final_url:\n",
    "        section = 'sports'\n",
    "    else:\n",
    "        section = 'news'\n",
    "    title_selector, body_selector, date_selector = ARTICLE_SELECTORS[section]\n",
    "\n",
    "    title_tag = article_soup.select_one(title_selector)\n",
    "    body_tag = article_soup.select_one(body_selector)\n",
    "    date_tag = article_soup.select_one(date_selector)\n",
    "\n",
    "    if section == 'sports' and body_tag is not None:\n",
    "        # Drop nested <div> and <p> elements, which hold unneeded extras such as\n",
    "        # e-mail lines, before reading the body text.\n",
    "        for extra in body_tag.select('div'):\n",
    "            extra.decompose()\n",
    "        for extra in body_tag.select('p'):\n",
    "            extra.decompose()\n",
    "\n",
    "    return tag_text(title_tag), tag_text(body_tag), tag_text(date_tag)[:7]\n",
    "\n",
    "\n",
    "# Ask for the search keyword and the number of result pages; prompt() returns None on cancel.\n",
    "keyword = pyautogui.prompt('검색어를 입력하세요')\n",
    "if not keyword:\n",
    "    raise ValueError('No search keyword was entered.')\n",
    "lastpage_answer = pyautogui.prompt('몇 페이지까지 크롤링 할까요?')\n",
    "if lastpage_answer is None:\n",
    "    raise ValueError('No page count was entered.')\n",
    "lastpage = int(lastpage_answer)\n",
    "\n",
    "# Create the Excel workbook, size the columns and write the header row.\n",
    "wb = Workbook()\n",
    "ws = wb.active\n",
    "for column, width in (('A', 60), ('B', 60), ('C', 120), ('D', 40), ('E', 40)):\n",
    "    ws.column_dimensions[column].width = width\n",
    "ws.append(['link', 'title', 'contents', 'date', 'company'])\n",
    "\n",
    "# The `start` query value advances by 10 per results page: 1, 11, 21, ...\n",
    "for page_num, start in enumerate(range(1, lastpage * 10, 10), start=1):\n",
    "    print(f'{page_num} 페이지 크롤링 중입니다.==================================================')\n",
    "    response = requests.get(\n",
    "        SEARCH_URL,\n",
    "        params={'query': keyword, 'start': start},\n",
    "        timeout=REQUEST_TIMEOUT,\n",
    "    )\n",
    "    soup = BeautifulSoup(response.text, 'html.parser')\n",
    "\n",
    "    for article in soup.select('div.info_group'):\n",
    "        links = article.select('a.info')\n",
    "        if len(links) < 2:\n",
    "            # Only entries with at least two info links are scraped.\n",
    "            continue\n",
    "\n",
    "        # The second link is the one followed (presumably the Naver-hosted copy of the\n",
    "        # article - confirm against the current search-result markup).\n",
    "        url = links[1].attrs['href']\n",
    "        article_response = requests.get(url, headers=ARTICLE_HEADERS, timeout=REQUEST_TIMEOUT)\n",
    "        soup_sub = BeautifulSoup(article_response.text, 'html.parser')\n",
    "\n",
    "        # Use the post-redirect URL: that is where 'entertain' / 'sports' shows up.\n",
    "        title_text, contents_text, date_text = parse_article(soup_sub, article_response.url)\n",
    "        company_text = tag_text(article.select_one('a.info.press'))\n",
    "\n",
    "        # One worksheet row per article: link, title, body, date, press name.\n",
    "        ws.append([url, title_text, contents_text, date_text, company_text])\n",
    "\n",
    "        # Short pause between article requests.\n",
    "        time.sleep(0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the workbook to df.xlsx in the current working directory.\n",
    "wb.save('df.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
